home *** CD-ROM | disk | FTP | other *** search
/ Skunkware 5 / Skunkware 5.iso / src / X11 / wais / waisgate / irverify.c < prev    next >
C/C++ Source or Header  |  1995-05-09  |  15KB  |  535 lines

  1. /* WIDE AREA INFORMATION SERVER SOFTWARE:
  2.    No guarantees or restrictions.  See the readme file for the full standard
  3.    disclaimer.
  4.  
  5.    Brewster@think.com
  6. */
  7.  
  8. /* Change log:
  9.  * $Log:    irverify.c,v $
  10.  * Revision 1.6  92/04/01  09:57:36  morris
  11.  * fixed and eof check in readPostings
  12.  * 
  13.  * Revision 1.5  92/03/28  19:48:10  jonathan
  14.  * Fixed Log header.
  15.  * 
  16.  * Revision 1.4  92/02/18  15:36:32  morris
  17.  * made it faster
  18.  * 
  19.  * Revision 1.3  92/02/12  13:32:33  jonathan
  20.  * Added $Log so RCS will put the log message in the header
  21.  * 
  22. */
  23.  
  24. #include "irverify.h"
  25. #include "irfiles.h"
  26. #include "panic.h"
  27. #include "futil.h"
  28.  
  29. #define TEST_READ false
  30.  
  31. /*---------------------------------------------------------------------------*/
  32.  
  33. void
  34. printIndex (db)
  35. database* db;
  36. /* iterate over the index printing the contents */
  37. {
  38.   serialPostingFile* spf = NULL;
  39.   char indexFileName[MAX_FILE_NAME_LEN + 1];
  40.   postingsForATerm* posts = NULL;
  41.   
  42.   spf = initSerialPostingFile(index_filename(indexFileName,db));
  43.   
  44.   while ((posts = getPostingsForNextTerm(spf)) != NULL)
  45.    { printPostingsForATerm(posts);
  46.      /* XXX dispose of them */
  47.    }
  48.    
  49.   disposeSerialPostingFile(spf);
  50. }
  51.  
  52. /*---------------------------------------------------------------------------*/
  53.  
  54. static void 
  55. print_dictionary_block_and_index _AP((unsigned char* block,long size,serialPostingFile* spf));
  56.  
  57. static void 
  58. print_dictionary_block_and_index(block,size,spf)
  59. unsigned char *block;
  60. long size;
  61. serialPostingFile* spf;
  62. /* this prints the contents of a dictionary block */
  63. {
  64.   long i;
  65.   postingsForATerm* posts = NULL;
  66.   
  67.   for(i = 0; i < size; i++)
  68.    {
  69.      char *word = dictionary_block_word(i, block);
  70.      long pos = dictionary_block_position(i, block);
  71.      if(word[0] == '\0')
  72.        break;
  73.      printf("Entry %3ld: %21s %7ld\n", i, word,pos);
  74.      posts = getPostingsAt(spf,pos);
  75.      printPostingsForATerm(posts);
  76.      /* XXX dispose of them postings */
  77.    }
  78. }
  79.  
  80. /*---------------------------------------------------------------------------*/
  81.  
  82. extern long number_of_dictionary_blocks;
  83. extern unsigned char *dictionary_header_block;
  84. extern unsigned char *dictionary_block;
  85.  
  86. void
  87. printIndexUsingDictionary(db)
  88. database* db;
  89. /* use the dictionary to go over the index */
  90. {
  91.   /* prints the contents of a dictionary */
  92.   FILE *dictStream = db->dictionary_stream;
  93.   long i;
  94.   long new_number_of_dictionary_blocks;
  95.   serialPostingFile* spf = NULL;
  96.   char indexFileName[MAX_FILE_NAME_LEN + 1];
  97.  
  98.   spf = initSerialPostingFile(index_filename(indexFileName,db));
  99.  
  100.   if(NULL == dictStream)
  101.     panic("dictionary dictStream is not open");
  102.   s_fseek(dictStream, 0L, SEEK_SET);
  103.   new_number_of_dictionary_blocks = read_bytes(DICTIONARY_HEADER_SIZE, dictStream);
  104.   if(new_number_of_dictionary_blocks > number_of_dictionary_blocks)
  105.     dictionary_header_block = NULL;
  106.   number_of_dictionary_blocks = new_number_of_dictionary_blocks;
  107.   printf("Number of dictionary blocks %ld\n", number_of_dictionary_blocks);
  108.   if(NULL == (dictionary_header_block =
  109.           read_dictionary_block(dictionary_header_block,
  110.                     DICTIONARY_HEADER_SIZE,
  111.                     number_of_dictionary_blocks,
  112.                     dictStream)))
  113.     panic("Could not read dictionary header block");
  114.   printf("The Dictionary Header Block:\n");
  115.   print_dictionary_block(dictionary_header_block, number_of_dictionary_blocks);
  116.   for(i = 0; i < number_of_dictionary_blocks; i++)
  117.   {
  118.     long pos = dictionary_block_position(i, dictionary_header_block);
  119.     if(NULL == (dictionary_block =
  120.         read_dictionary_block(dictionary_block,
  121.                       pos, DICTIONARY_BLOCK_SIZE, dictStream)))
  122.       panic("Could not read dictionary block %ld", pos);
  123.     printf("\n\nDictionary block %ld (position %ld):\n", i, pos);
  124.     print_dictionary_block_and_index(dictionary_block, DICTIONARY_BLOCK_SIZE,spf);
  125.   }
  126.   fseek(dictStream, 0L, SEEK_END);
  127.   disposeSerialPostingFile(spf);
  128. }
  129.  
  130. /*---------------------------------------------------------------------------*/
  131.  
  132. serialPostingFile*
  133. initSerialPostingFile(filename)
  134. char* filename;
  135. /* open an inverted index file create by irn8. return a structure
  136.    maintaining its state
  137.  */
  138. {
  139.   FILE* stream = NULL;
  140.   serialPostingFile* pf = NULL;
  141.  
  142.   stream = s_fopen(filename,"rb");
  143.   if (stream == NULL) /* can't open that file */
  144.     return(NULL);
  145.   s_fseek(stream,INDEX_HEADER_SIZE,SEEK_SET);
  146.  
  147.   pf = (serialPostingFile*)s_malloc((size_t)sizeof(serialPostingFile));
  148.   pf->stream = stream;
  149.   pf->length = file_length(stream);
  150.   pf->current_index_block = INDEX_HEADER_SIZE;
  151.  
  152.   return(pf);
  153. }
  154.  
  155. /*---------------------------------------------------------------------------*/
  156.  
  157. void
  158. disposeSerialPostingFile(pf)
  159. serialPostingFile* pf;
  160. {
  161.   s_fclose(pf->stream);
  162.   s_free(pf);
  163. }
  164.  
  165. /*---------------------------------------------------------------------------*/
  166.  
  167. void 
  168. printPostingsForATerm(pfat)
  169. postingsForATerm* pfat;
  170. {
  171.   long i;
  172.  
  173.   if (pfat->word[0] != '\0')
  174.     printf("word '%s'\n",pfat->word);
  175.     
  176.   for (i = 0; i < pfat->entries; i++)
  177.     printf("\tdoc %ld weight %ld\n",pfat->docs[i],pfat->weights[i]);
  178. }
  179.  
  180. /*---------------------------------------------------------------------------*/
  181.  
  182. postingsForATerm*
  183. getPostingsAt(spf,position)
  184. serialPostingFile* spf;
  185. long position;
  186. /* position better be a valid starting position! */
  187. {
  188.   fseek(spf->stream,position,SEEK_SET);
  189.   spf->current_index_block = position;
  190.   return(getPostingsForNextTerm(spf));
  191. }
  192.  
  193. /*---------------------------------------------------------------------------*/
  194.  
  195. void
  196. disposePostingsForATerm(pfat)
  197. postingsForATerm* pfat;
  198. {
  199.   s_free(pfat->docs);
  200.   s_free(pfat->weights);
  201.   s_free(pfat);
  202. }
  203.  
  204. /*---------------------------------------------------------------------------*/
  205.  
  206. void 
  207. removePostings(pfat,start,run)
  208. postingsForATerm* pfat;
  209. long start;
  210. long run;
  211. /* remove postings start through start + run from the pfat */
  212. {
  213.   void* toPtr = NULL;
  214.   long runLen;
  215.   long toMove;
  216.  
  217.   if (start + run > pfat->entries)
  218.     return; /* this is an error */
  219.  
  220.   toPtr = (void*)(pfat->docs + (start * sizeof(docID)));
  221.   runLen = run * sizeof(docID);
  222.   toMove = ((pfat->entries - start) * sizeof(docID)) - runLen;
  223.   memmove(toPtr,toPtr + runLen,toMove);
  224.  
  225.   toPtr = (void*)(pfat->weights + (start * sizeof(postingWeight)));
  226.   runLen = run * sizeof(docID);
  227.   toMove = ((pfat->entries - start) * sizeof(postingWeight)) - runLen;
  228.   memmove(toPtr,toPtr + runLen,toMove);
  229.  
  230.   pfat->entries -= run;
  231. }
  232.  
  233. /*---------------------------------------------------------------------------*/
  234.  
  235. void
  236. readPostings(spf,posts,not_full_flag)
  237. serialPostingFile* spf;
  238. postingsForATerm* posts;
  239. long not_full_flag;
  240. {
  241.   long count;
  242.   long document_id,weight,number_of_valid_entries;
  243.   long index_block = read_bytes(NEXT_INDEX_BLOCK_SIZE,spf->stream);
  244.   long index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE,spf->stream);
  245.   
  246.   if (EOF == index_block_size) 
  247.    { fprintf(stderr,"reading from the index file failed\n");
  248.      return;
  249.    }
  250.       
  251.   if (not_full_flag == INDEX_BLOCK_NOT_FULL_FLAG) /* not full */
  252.     number_of_valid_entries = index_block / INDEX_ELEMENT_SIZE;
  253.  
  254.   else if (not_full_flag == INDEX_BLOCK_FULL_FLAG) /* full */
  255.    { number_of_valid_entries = 
  256.        (index_block_size - INDEX_BLOCK_HEADER_SIZE) / INDEX_ELEMENT_SIZE;
  257.    }
  258.  
  259.   else
  260.    { /* bad news,file is corrupted.  this should return error
  261.     code rather than panicing XXX */
  262.      panic("Expected the flag in the inverted file to be valid.  it is %lx",
  263.        not_full_flag);
  264.    }
  265.  
  266.   posts->docs = 
  267.     (docID*)s_malloc((size_t)(sizeof(docID) * number_of_valid_entries));
  268.   posts->weights = 
  269.     (postingWeight*)s_malloc((size_t)(sizeof(postingWeight) * 
  270.                       number_of_valid_entries));
  271.  
  272.   for (count = 0; count < number_of_valid_entries; count++)
  273.    { long val;
  274.      posts->docs[count] = read_bytes(DOCUMENT_ID_SIZE,spf->stream);
  275.      s_fseek(spf->stream,WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE,SEEK_CUR);
  276.      val = read_bytes(WEIGHT_SIZE,spf->stream);
  277.      if(EOF == val)
  278.       { fprintf(stderr,"reading from the inverted file failed\n");
  279.         return;
  280.       }
  281.      else
  282.       { posts->weights[count] = val;
  283.         posts->entries++;
  284.       } 
  285.    }
  286. }
  287.  
  288. /*---------------------------------------------------------------------------*/
  289.  
  290. long
  291. readDictionaryIndexBlock(number_of_occurances,word,stream)
  292. long *number_of_occurances;
  293. char *word;
  294. FILE *stream;
  295. /* NOTE - similar to read_dictionary_index_lock */
  296. {
  297.   /* this reads the dictionary index block from the index stream.
  298.      It assumes the stream is positioned at the right after the flag
  299.      returns 0 if it succeeds.
  300.      returns -1 if it is at the end of a file.
  301.      returns -2 if it read something strange.
  302.      Always sets word length to 0 if it fails. */
  303.  
  304.   char temp[MAX_WORD_LENGTH + 2];
  305.   
  306.   word[0] = '\0';
  307.  
  308.   s_fseek(stream,NEXT_INDEX_BLOCK_SIZE+INDEX_BLOCK_SIZE_SIZE,SEEK_CUR);
  309.   *number_of_occurances = read_bytes(NUMBER_OF_OCCURANCES_SIZE,stream);
  310.   fgets(temp,MAX_WORD_LENGTH + 2,stream); /* 2 is for the \n and '\0' */
  311.  
  312.   /* trim the \n */
  313.   if(temp[strlen(temp) - 1] == '\n'){
  314.     temp[strlen(temp) - 1] = '\0';
  315.   }
  316.   strcpy(word, temp);
  317.  
  318.   return(0);
  319. }
  320.  
  321. /*---------------------------------------------------------------------------*/
  322.  
  323. postingsForATerm*
  324. getPostingsForNextTerm(spf)
  325. serialPostingFile* spf;
  326. {
  327.   postingsForATerm* posts = NULL;
  328.   
  329.   posts = (postingsForATerm*)s_malloc((size_t)sizeof(postingsForATerm));
  330.   posts->word[0] = '\0';
  331.   posts->entries = 0;
  332.  
  333.   /* this is really a 2 step process - read the dictonary block, then read
  334.      the postings.  I don't see any reason to unwrap it though */
  335.   while (true)
  336.    { 
  337.      long flag = read_bytes(INDEX_BLOCK_FLAG_SIZE,spf->stream);
  338.  
  339.      if (flag == EOF)
  340.       { return(NULL);
  341.       }
  342.  
  343.      if (flag == INDEX_BLOCK_DICTIONARY_FLAG) /* the dictionary entry */
  344.       { /* read the dictionary part */
  345.     long number_of_occurances;
  346.     if (readDictionaryIndexBlock(&number_of_occurances,
  347.                      posts->word,spf->stream) < 0)
  348.       panic("read dictionary index block failed");
  349.       }
  350.      else /* the posting entry */
  351.       { readPostings(spf,posts,flag);
  352.     break;
  353.       }
  354.    }
  355.  
  356.   return(posts);
  357. }
  358.  
  359. /*---------------------------------------------------------------------------*/
  360.  
  361.  
  362.  
  363.  
  364. #ifdef old
  365.  
these routines are slower than the current ones, keep them around for a 
while until we are sure the new ones work ok
  368.  
  369. /*---------------------------------------------------------------------------*/
  370.  
  371. postingsForATerm*
  372. getPostingsForNextTerm(spf)
  373. serialPostingFile* spf;
  374. {
  375.   postingsForATerm* slow;
  376.   postingsForATerm* fast;
  377. /*
  378.   long filePos = s_ftell(spf->stream);
  379.   slow = getPostingsForNextTermSLOW(spf);
  380.   printf("SLOW:\n");
  381.   printPostingsForATerm(slow);NL();
  382.   s_fseek(spf->stream,filePos,SEEK_SET);
  383. */
  384.   fast = getPostingsForNextTermFAST(spf);
  385. /*  printf("FAST:\n");
  386.   printPostingsForATerm(fast);NL();
  387.   disposePostingsForATerm(slow);
  388. */
  389.   return(fast);
  390. }
  391.  
  392. /*---------------------------------------------------------------------------*/
  393.  
  394. postingsForATerm*
  395. getPostingsForNextTermSLOW(spf)
  396. serialPostingFile* spf;
  397. {
  398.   postingsForATerm* posts = NULL;
  399.   boolean keepGoing = true;
  400.   
  401.   if (spf->current_index_block >= spf->length)
  402.     return(NULL);
  403.  
  404.   posts = (postingsForATerm*)s_malloc((size_t)sizeof(postingsForATerm));
  405.   posts->word[0] = '\0';
  406.   posts->entries = 0;
  407.  
  408.   while (keepGoing) 
  409.    { 
  410.      long flag = read_bytes(INDEX_BLOCK_FLAG_SIZE,spf->stream);
  411.      long next_index_block = read_bytes(NEXT_INDEX_BLOCK_SIZE,spf->stream);
  412.      long index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE,spf->stream);
  413.  
  414.      if (flag == INDEX_BLOCK_DICTIONARY_FLAG)
  415.        { long last_index_block;
  416.      long index_block_size;
  417.      long number_of_occurances;
  418.      char word[MAX_WORD_LENGTH + 1];
  419.      if (0 > read_dictionary_index_block(spf->current_index_block,
  420.                          &last_index_block,
  421.                          &index_block_size,
  422.                          &number_of_occurances,
  423.                          word,
  424.                          spf->stream))
  425.        panic("read dictionary index block failed");
  426.      cprintf(TEST_READ,
  427.          "%ld: size %3ld word '%s',occurances %ld last block %ld\n",
  428.          spf->current_index_block,index_block_size,word,
  429.          number_of_occurances,next_index_block);
  430.      strcpy(posts->word,word);
  431.        }
  432.  
  433.      else if (flag == INDEX_BLOCK_NOT_FULL_FLAG)
  434.        { cprintf(TEST_READ,"%ld: size %3ld Not full,valid entries %ld\n",
  435.          spf->current_index_block,index_block_size,next_index_block);
  436.      readPostings(spf,posts);
  437.      keepGoing = false;
  438.        }
  439.  
  440.      else if (flag == INDEX_BLOCK_FULL_FLAG)
  441.        { cprintf(TEST_READ,"%ld: size %3ld full block,next block %ld\n",
  442.          spf->current_index_block,index_block_size,next_index_block);
  443.      readPostings(spf,posts);
  444.      keepGoing = false;
  445.        }
  446.  
  447.      else 
  448.        panic("bad entry %ld (ftell %ld),flag was %ld",
  449.          spf->current_index_block,ftell(spf->stream),flag);
  450.  
  451.      spf->current_index_block += index_block_size;
  452.      s_fseek(spf->stream,spf->current_index_block,SEEK_SET);
  453.    }
  454.  
  455.   return(posts);
  456. }
  457.  
  458. /*---------------------------------------------------------------------------*/
  459.  
  460. void
  461. readPostings(spf,posts)
  462. serialPostingFile* spf;
  463. postingsForATerm* posts;
  464. {
  465.   long not_full_flag = INDEX_BLOCK_FULL_FLAG;
  466.   long count,index_block_size;
  467.   long document_id,weight,number_of_valid_entries;
  468.   long index_block = spf->current_index_block;
  469.   
  470.   if (index_block >= 0)
  471.     {
  472.       /* read the index block */
  473.       if (0 != fseek(spf->stream,(long)index_block,SEEK_SET))
  474.     { 
  475.       fprintf(stderr,
  476.           "fseek failed into the inverted file to position %ld\n",
  477.           (long)index_block); 
  478.       return;
  479.     }
  480.       
  481.       not_full_flag = read_bytes(INDEX_BLOCK_FLAG_SIZE,spf->stream);
  482.       index_block = read_bytes(NEXT_INDEX_BLOCK_SIZE,spf->stream);
  483.       index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE,spf->stream);
  484.       if (EOF == index_block_size) 
  485.     { fprintf(stderr,"reading from the index file failed\n");
  486.       return;
  487.     }
  488.       
  489.       if (not_full_flag == INDEX_BLOCK_NOT_FULL_FLAG)
  490.     { /* not full */
  491.       number_of_valid_entries = index_block;
  492.     }
  493.       else if (not_full_flag == INDEX_BLOCK_FULL_FLAG)
  494.     { /* full */
  495.       number_of_valid_entries = index_block_size - INDEX_BLOCK_HEADER_SIZE;
  496.     }
  497.       else
  498.     { /* bad news,file is corrupted.  this should return error
  499.          code rather than panicing XXX */
  500.     panic("Expected the flag in the inverted file to be valid.  it is %ld",
  501.           not_full_flag);
  502.         }
  503.  
  504.       cprintf(TEST_READ,"  number of valid bytes: %ld\n",
  505.           number_of_valid_entries);
  506.       
  507.       for (count = 0; count < number_of_valid_entries; 
  508.        count = count + INDEX_ELEMENT_SIZE)
  509.     {
  510.       document_id = read_bytes(DOCUMENT_ID_SIZE,spf->stream);
  511.       (void)read_bytes(WORD_POSITION_SIZE,spf->stream);
  512.       (void)read_bytes(CHARACTER_POSITION_SIZE,spf->stream);
  513.       weight = read_bytes(WEIGHT_SIZE,spf->stream);
  514.       cprintf(TEST_READ,"    entry %ld,Doc_id: %ld,weight %ld\n",
  515.           count % INDEX_ELEMENT_SIZE,document_id,weight);
  516.       if(EOF == weight) 
  517.         { fprintf(stderr,"reading from the doc-id table failed\n");
  518.           return;
  519.         }
  520.       posts->entries++;
  521.       posts->docs = (docID*)s_realloc(posts->docs,
  522.                   (size_t)(sizeof(docID) * posts->entries));
  523.       posts->docs[posts->entries - 1] = document_id;
  524.       posts->weights = (postingWeight*)s_realloc(posts->weights,
  525.                      (size_t)(sizeof(postingWeight) * 
  526.                           posts->entries));
  527.       posts->weights[posts->entries - 1] = weight;
  528.     }
  529.     }
  530. }
  531.  
  532. /*---------------------------------------------------------------------------*/
  533.  
#endif /* def old */
  535.